{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85\n",
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index35.htm', 'index36.htm', 'index37.htm', 'index38.htm', 'index39.htm', 'index4.htm', 'index40.htm', 'index41.htm', 'index42.htm', 'index43.htm', 'index44.htm', 'index45.htm', 'index46.htm', 'index47.htm', 'index48.htm', 'index49.htm', 'index5.htm', 'index50.htm', 'index51.htm', 'index52.htm', 'index53.htm', 'index54.htm', 'index55.htm', 'index56.htm', 'index57.htm', 'index58.htm', 'index59.htm', 'index6.htm', 'index60.htm', 'index61.htm', 'index62.htm', 'index63.htm', 'index64.htm', 'index65.htm', 'index66.htm', 'index67.htm', 'index68.htm', 'index69.htm', 'index7.htm', 'index70.htm', 'index71.htm', 'index72.htm', 'index73.htm', 'index74.htm', 'index75.htm', 'index76.htm', 'index77.htm', 'index78.htm', 'index79.htm', 'index8.htm', 'index80.htm', 'index81.htm', 'index82.htm', 'index83.htm', 'index84.htm', 'index85.htm', 'index86.htm', 'index87.htm', 'index88.htm', 'index89.htm', 'index9.htm', 'index90.htm', 'index91.htm', 'index92.htm', 'index93.htm', 'index94.htm', 'index95.htm', 'index96.htm', 'index97.htm', 'index98.htm', 'index99.htm']\n"
     ]
    },
    {
     "data": {
      "text/plain": "      index                                   标题  \\\n0         0        商学院电子商务专业召开申请调整学位授予学科门类 专家评审会   \n2         2                       会计学院大一年级大会顺利召开   \n1         1              广州新华学院会计学院刘运国院长一行莅临我院访问   \n3         3                  【国奖映像】蒋晓琳：明确目标，为之努力   \n5         5                      励能计划2021：你选哪一项？   \n...     ...                                  ...   \n1657     17                文学与传媒系2016年学术研讨会议成功举办   \n1660      0  严谨为学，诚信迎考--工商管理系班级期末总结暨诚信考试动员大会圆满结束   \n1659     19                  经济学与商务管理系顺利召开办公培训会议   \n1661      1         以学生为本，做优秀学生干部——我院“青马工程”第二讲举行   \n1662      2       经济学与商务管理系党总支第17期入党积极分子实践活动圆满结束   \n\n                                                     链结          日期  \n0     https://www.nfu.edu.cn/xydt/cf4420785b9046e998...  2021-04-07  \n2     https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebf...  2021-04-06  \n1     https://www.nfu.edu.cn/xydt/935f580040704990a4...  2021-04-06  \n3     https://www.nfu.edu.cn/xydt/9611d110ec8a486587...  2021-04-02  \n5     https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c...  2021-04-02  \n...                                                 ...         ...  \n1657  https://www.nfu.edu.cn/xydt/ae2cf68f6b1247a6a1...  2016-01-08  \n1660  https://www.nfu.edu.cn/xydt/3d02b255690e4ce796...  2016-01-04  \n1659  https://www.nfu.edu.cn/xydt/9b59863e5051412d80...  2016-01-04  \n1661  https://www.nfu.edu.cn/xydt/0132532dbb0e448d82...  2015-10-30  \n1662  https://www.nfu.edu.cn/xydt/2d20a911787b4cc09c...  2015-03-28  \n\n[1683 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>index</th>\n      <th>标题</th>\n      <th>链结</th>\n      <th>日期</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>商学院电子商务专业召开申请调整学位授予学科门类 专家评审会</td>\n      <td>https://www.nfu.edu.cn/xydt/cf4420785b9046e998...</td>\n      <td>2021-04-07</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>会计学院大一年级大会顺利召开</td>\n      <td>https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebf...</td>\n      <td>2021-04-06</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>广州新华学院会计学院刘运国院长一行莅临我院访问</td>\n      <td>https://www.nfu.edu.cn/xydt/935f580040704990a4...</td>\n      <td>2021-04-06</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>【国奖映像】蒋晓琳：明确目标，为之努力</td>\n      <td>https://www.nfu.edu.cn/xydt/9611d110ec8a486587...</td>\n      <td>2021-04-02</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>5</td>\n      <td>励能计划2021：你选哪一项？</td>\n      <td>https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c...</td>\n      <td>2021-04-02</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1657</th>\n      <td>17</td>\n      <td>文学与传媒系2016年学术研讨会议成功举办</td>\n      <td>https://www.nfu.edu.cn/xydt/ae2cf68f6b1247a6a1...</td>\n      <td>2016-01-08</td>\n    </tr>\n    <tr>\n      <th>1660</th>\n      <td>0</td>\n      <td>严谨为学，诚信迎考--工商管理系班级期末总结暨诚信考试动员大会圆满结束</td>\n      <td>https://www.nfu.edu.cn/xydt/3d02b255690e4ce796...</td>\n      <td>2016-01-04</td>\n    </tr>\n    <tr>\n      <th>1659</th>\n      <td>19</td>\n      
<td>经济学与商务管理系顺利召开办公培训会议</td>\n      <td>https://www.nfu.edu.cn/xydt/9b59863e5051412d80...</td>\n      <td>2016-01-04</td>\n    </tr>\n    <tr>\n      <th>1661</th>\n      <td>1</td>\n      <td>以学生为本，做优秀学生干部——我院“青马工程”第二讲举行</td>\n      <td>https://www.nfu.edu.cn/xydt/0132532dbb0e448d82...</td>\n      <td>2015-10-30</td>\n    </tr>\n    <tr>\n      <th>1662</th>\n      <td>2</td>\n      <td>经济学与商务管理系党总支第17期入党积极分子实践活动圆满结束</td>\n      <td>https://www.nfu.edu.cn/xydt/2d20a911787b4cc09c...</td>\n      <td>2015-03-28</td>\n    </tr>\n  </tbody>\n</table>\n<p>1683 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "\n",
    "\n",
    "\n",
    "import os\n",
    "import urllib.parse\n",
    "\n",
    "import pandas as pd\n",
    "import requests_html\n",
    "from requests_html import HTMLSession\n",
    "\n",
    "# Fetch the first listing page of the campus-news section and fail fast on HTTP errors.\n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/xydt/index.htm\")\n",
    "r.raise_for_status()\n",
    "\n",
    "# Keep a snapshot on disk; create the output folder so a fresh checkout also works.\n",
    "os.makedirs(\"html_out\", exist_ok=True)\n",
    "with open(\"html_out/_nfu_校园动态.html\", encoding=\"utf8\", mode=\"w\") as fp:\n",
    "    fp.write(r.html.html)\n",
    "with open(\"html_out/_nfu_校园动态.html\", encoding=\"utf8\", mode=\"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "\n",
    "parsed = requests_html.soup_parse(html_load)\n",
    "\n",
    "# base_url and nfu_urlparse are reused further down the cell, so their names stay as-is.\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "\n",
    "# urljoin resolves relative hrefs against the listing URL and leaves absolute hrefs intact.\n",
    "list_URL = [urllib.parse.urljoin(base_url, detail_url)\n",
    "            for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "\n",
    "# One row per article on the first page: title, link, date.\n",
    "df = pd.DataFrame({\n",
    "    \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "    \"链结\": list_URL,\n",
    "    \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "})\n",
    "\n",
    "# Probe the numbered listing pages to find where pagination ends.\n",
    "# MAX_PAGES is only an upper bound for the probe; the real page count is discovered below.\n",
    "MAX_PAGES = 100\n",
    "first_missing_page = MAX_PAGES\n",
    "for i in range(1, MAX_PAGES):\n",
    "    r = session.get('https://www.nfu.edu.cn/xydt/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        first_missing_page = i\n",
    "        break\n",
    "\n",
    "# Build the download list from the pages that exist (index.htm first, then index1..indexN),\n",
    "# instead of always requesting 1..99 and saving the error pages past the end.\n",
    "url_group = ['https://www.nfu.edu.cn/xydt/index.htm'] + [\n",
    "    'https://www.nfu.edu.cn/xydt/index'+str(i)+'.htm' for i in range(1, first_missing_page)\n",
    "]\n",
    "\n",
    "\n",
    "# Download every listing page into html_out/, mirroring the URL path (e.g. html_out/xydt/index.htm).\n",
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "    if r.status_code != 200:\n",
    "        # An error page is not listing data; report it and do not store it.\n",
    "        print('skipped', url, r.status_code)\n",
    "        continue\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    # Drop the leading '/' so the joined path stays inside html_out/.\n",
    "    target = os.path.join('html_out', path.lstrip('/'))\n",
    "    os.makedirs(os.path.dirname(target), exist_ok=True)\n",
    "    with open(target, encoding=\"utf8\", mode=\"w\") as fp:\n",
    "        fp.write(r.html.html)\n",
    "\n",
    "# XPath expressions shared by every listing page: article link, title and date.\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}\n",
    "\n",
    "\n",
    "def pages_content_url(parsed):\n",
    "    \"\"\"Return the absolute article URLs linked from one parsed listing page.\n",
    "\n",
    "    parsed: a parsed HTML tree exposing .xpath(), as returned by requests_html.soup_parse.\n",
    "    Each href matched by dict_xpath['链接_xpath'] is resolved against the module-level\n",
    "    base_url with urljoin, so relative hrefs land next to the listing page and\n",
    "    absolute hrefs are returned unchanged.\n",
    "    \"\"\"\n",
    "    return [urllib.parse.urljoin(base_url, detail_url)\n",
    "            for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "# Parse every saved listing page and collect one DataFrame per page.\n",
    "list_df = []\n",
    "\n",
    "# os.listdir order is arbitrary and may include stray files; keep only .htm pages, sorted.\n",
    "files = sorted(name for name in os.listdir('html_out/xydt/') if name.endswith('.htm'))\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/xydt/'+html, encoding='utf8', mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "    # The file can be closed before parsing; only its text is needed.\n",
    "    parsed = requests_html.soup_parse(html_load)\n",
    "    list_df.append(pd.DataFrame({\n",
    "        \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "        \"链结\": pages_content_url(parsed),\n",
    "        \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "    }))\n",
    "\n",
    "# ignore_index gives one continuous row index instead of leaking each page's own\n",
    "# row labels into a stray 'index' column of the result and the exported sheet.\n",
    "df_all1 = pd.concat(list_df, ignore_index=True).sort_values(by='日期', ascending=False)\n",
    "display(df_all1)\n",
    "\n",
    "# Create the export folder so the write also works on a fresh checkout.\n",
    "os.makedirs('data_out', exist_ok=True)\n",
    "with pd.ExcelWriter('data_out/nfu_校园动态.xlsx', mode='w', engine=\"openpyxl\") as writer:\n",
    "    df_all1.to_excel(writer, sheet_name='学校要闻')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}